#!/usr/bin/env python
# encoding: utf-8
"""
@summary: Compute the distribution of daily active users for July
@author: hongxingfan
@since: 2014年8月20日	上午9:51:12
"""
import sys
import os

# Input source: hadoop dfs -cat /user/autolog/tmp/fan/ActivityUser_07/pa* | more, exported to a local file
# Input line format: 1306181222501951	{20140714:1;2014-07-06:2;20140720:1;}

def count_daily_active(lines, progress_every=300000):
    """Count, per date key, how many "date:count" entries mention that date.

    Each input line is expected to look like::

        <user_id>\t{<date>:<count>;<date>:<count>;}

    Only the date part (the text before the first ":" of each entry) is
    used; the per-entry count is ignored and every entry adds 1 to its date.
    Date keys are taken verbatim, so "20140714" and "2014-07-14" are counted
    as different keys.

    Lines without a tab-separated second column, empty entries, and entries
    without a ":" are skipped instead of aborting the whole run.

    Args:
        lines: iterable of text lines (for example an open file object).
        progress_every: print a progress message every this many lines;
            a falsy value disables progress output.

    Returns:
        dict mapping date key -> number of entries seen for that date.
    """
    day_counts = {}
    for line_no, line in enumerate(lines, 1):
        # NOTE(review): progress goes to stdout, the same stream as the
        # results; kept as in the original, consider moving it to stderr.
        if progress_every and line_no % progress_every == 0:
            print("已处理 %d 条记录" % (line_no))

        cols = line.strip().split("\t")
        if len(cols) < 2:
            # Blank or malformed line: there is no payload column to parse.
            continue

        # Drop the surrounding braces, then walk the ";"-separated entries.
        # A trailing ";" yields an empty entry, which partition() rejects.
        for entry in cols[1].strip("{}").split(";"):
            day, sep, _ = entry.partition(":")
            if not sep:
                continue
            day_counts[day] = day_counts.get(day, 0) + 1
    return day_counts


def main(argv):
    """Run the script: read the file named by argv[1] and print the counts.

    Args:
        argv: command-line argument list, program name first.

    Returns:
        Process exit status: 0 on success, 1 if the path is not a readable
        regular file, 2 if the path argument is missing.
    """
    if len(argv) < 2:
        sys.stderr.write("usage: %s <input_file>\n" % (argv[0] if argv else "script"))
        return 2

    filePath = argv[1]
    if not os.path.isfile(filePath):
        sys.stderr.write("'%s' is an error path!\n" % (filePath))
        return 1

    # The context manager guarantees the file is closed even on errors.
    with open(filePath, "r") as handler:
        result_hash = count_daily_active(handler)

    # Output: one "<date>\t<count>" row per date key.
    for (k, v) in result_hash.items():
        print("%s\t%d" % (k, v))
    return 0


if __name__ == "__main__":
    sys.exit(main(sys.argv))
